notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [2]:

    
sales = pd.DataFrame(
 {
 'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
 'bread': [139, 237, 326, 456],
 'butter': [20, 45, 70, 98]
 }
)



In [3]:

    
sales.groupby('weekday').count()



In [4]:

    
sales.groupby('weekday')['city'].count()









    Out[4]:





weekday
Mon    2
Sun    2
Name: city, dtype: int64



In [5]:

    
sales.groupby('weekday')[['city','butter']].count()



In [6]:

    
sales.groupby(['weekday','city'])['butter'].mean()









    Out[6]:





weekday  city  
Mon      Austin    70
         Dallas    98
Sun      Austin    20
         Dallas    45
Name: butter, dtype: int64



In [7]:

    
#Groupby by custom series
customers = pd.Series(['Dave','Alice','Bob','Alice'])
sales.groupby(customers)['bread'].sum()









    Out[7]:





Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64



In [8]:

    
#Groupby is more efficient (less memory and more speed) with categorical column
sales.dtypes









    Out[8]:





bread       int64
butter      int64
city       object
weekday    object
dtype: object



In [9]:

    
sales['weekday'] = sales['weekday'].astype('category')
sales.dtypes









    Out[9]:





bread         int64
butter        int64
city         object
weekday    category
dtype: object



In [10]:

    
sales.groupby('weekday').sum()

Group by and aggregation



In [11]:

    
sales.groupby('city')[['bread','butter']].agg(['max','sum'])



In [12]:

    
# custom aggregation
def data_range(series):
    return series.max() - series.min()

sales.groupby('weekday')[['bread', 'butter']].agg(data_range)



In [13]:

    
sales.groupby(customers)[['bread', 'butter']].agg({'bread':'sum', 'butter':data_range})

Group by and Transformation



In [14]:

    
def zscore(series):
    return (series - series.mean()) / series.std()



In [15]:

    
#!pip install pydataset
from pydataset import data



In [16]:

    
boston = data('Boston')
boston['id']=boston.index
boston.head()



In [27]:

    
zscore(boston['rm']).head()









    Out[27]:





1    0.413263
2    0.194082
3    1.281446
4    1.015298
5    1.227362
Name: rm, dtype: float64



In [28]:

    
len(zscore(boston['rm']))









    Out[28]:





506



In [29]:

    
boston.groupby('rad')['rm'].transform(zscore).head()









    Out[29]:





1    0.036211
2   -0.324529
3    0.758377
4    0.813555
5    1.044462
Name: rm, dtype: float64



In [30]:

    
len(boston.groupby('rad')['rm'].transform(zscore))









    Out[30]:





506

Apply transformation and aggregation



In [31]:

    
def zscore_with_indus_and_id(group):
    df = pd.DataFrame(
        {'rm': zscore(group['rm']),
         'rad': group['rad'],
         'id': group['id']})
    return df



In [33]:

    
boston.groupby('rad').apply(zscore_with_indus_and_id).head()

Groupby Object



In [34]:

    
splitting = boston.groupby('rad')
print(type(splitting))
print(type(splitting.groups))
print(splitting.groups.keys())









    



<class 'pandas.core.groupby.DataFrameGroupBy'>
<class 'dict'>
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 24])



In [38]:

    
splitting.groups









    Out[38]:





{1: Int64Index([  1, 194, 195, 255, 256, 284, 285, 286, 287, 332, 333, 342, 343,
             350, 351, 502, 503, 504, 505, 506],
            dtype='int64'),
 2: Int64Index([  2,   3,  57,  89,  90,  91,  92,  96,  97,  98,  99, 100, 121,
             122, 123, 124, 125, 126, 127, 197, 198, 199, 202, 203],
            dtype='int64'),
 3: Int64Index([  4,   5,   6,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
              50,  55,  65,  85,  86,  87,  88, 180, 181, 182, 183, 184, 185,
             186, 187, 200, 201, 257, 270, 271, 272, 273, 274, 346, 347],
            dtype='int64'),
 4: Int64Index([ 14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
             ...
             349, 352, 353, 355, 356, 489, 490, 491, 492, 493],
            dtype='int64', length=110),
 5: Int64Index([  7,   8,   9,  10,  11,  12,  13,  36,  37,  38,
             ...
             335, 336, 337, 338, 339, 340, 341, 344, 345, 354],
            dtype='int64', length=115),
 6: Int64Index([112, 113, 114, 115, 116, 117, 118, 119, 120, 239, 240, 241, 242,
             243, 244, 288, 289, 290, 494, 495, 496, 497, 498, 499, 500, 501],
            dtype='int64'),
 7: Int64Index([245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 302, 303, 304,
             305, 306, 307, 308],
            dtype='int64'),
 8: Int64Index([ 59,  60,  61,  62,  63,  64, 221, 222, 223, 224, 225, 226, 227,
             228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238],
            dtype='int64'),
 24: Int64Index([357, 358, 359, 360, 361, 362, 363, 364, 365, 366,
             ...
             479, 480, 481, 482, 483, 484, 485, 486, 487, 488],
            dtype='int64', length=132)}



In [42]:

    
### Group object iteration
for group_name, group in splitting:
    avg= group['age'].mean()
    print(group_name,avg)









    



1 45.025
2 64.77083333333333
3 49.31052631578947
4 60.84181818181818
5 69.22347826086958
6 60.14230769230768
7 40.14117647058823
8 67.35
24 89.80530303030304



In [43]:

    
boston.groupby('rad')['age'].mean()









    Out[43]:





rad
1     45.025000
2     64.770833
3     49.310526
4     60.841818
5     69.223478
6     60.142308
7     40.141176
8     67.350000
24    89.805303
Name: age, dtype: float64

Group object: iteration and filtering



In [57]:

    
for group_name, group in splitting:
    avg= group.loc[group['zn']==0,'age'].mean()
    print(group_name,avg)









    



1 77.76666666666667
2 75.31666666666666
3 55.20384615384614
4 72.01298701298701
5 73.86410256410257
6 71.94705882352942
7 nan
8 70.5111111111111
24 89.80530303030304



In [60]:

    
{zn: group.loc[group['zn']==0,'age'].mean() for zn, group in splitting}









    Out[60]:





{1: 77.76666666666667,
 2: 75.31666666666666,
 3: 55.20384615384614,
 4: 72.01298701298701,
 5: 73.86410256410257,
 6: 71.94705882352942,
 7: nan,
 8: 70.5111111111111,
 24: 89.80530303030304}

Boolean groupby



In [65]:

    
filt= boston['zn']==0
res=boston.groupby(['rad',filt])['age'].mean()
res









    Out[65]:





rad  zn   
1    False    30.992857
     True     77.766667
2    False    33.133333
     True     75.316667
3    False    36.541667
     True     55.203846
4    False    34.775758
     True     72.012987
5    False    59.440541
     True     73.864103
6    False    37.844444
     True     71.947059
7    False    40.141176
8    False    57.866667
     True     70.511111
24   True     89.805303
Name: age, dtype: float64



In [69]:

    
res.unstack(-1)



In [ ]:

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
1	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
2	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
3	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
4	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
5	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

zn	False	True
rad
1	30.992857	77.766667
2	33.133333	75.316667
3	36.541667	55.203846
4	34.775758	72.012987
5	59.440541	73.864103
6	37.844444	71.947059
7	40.141176	NaN
8	57.866667	70.511111
24	NaN	89.805303